1 /*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.xml;
18
19 import com.google.common.annotations.Beta;
20 import com.google.common.annotations.GwtCompatible;
21 import com.google.common.escape.Escaper;
22 import com.google.common.escape.Escapers;
23
24 /**
25 * {@code Escaper} instances suitable for strings to be included in XML
26 * attribute values and elements' text contents. When possible, avoid manual
27 * escaping by using templating systems and high-level APIs that provide
28 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or
29 * <a href="http://www.jdom.org/">JDOM</a>.
30 *
31 * <p><b>Note:</b> Currently the escapers provided by this class do not escape
32 * any characters outside the ASCII character range. Unlike HTML escaping the
33 * XML escapers will not escape non-ASCII characters to their numeric entity
34 * replacements. These XML escapers provide the minimal level of escaping to
35 * ensure that the output can be safely included in a Unicode XML document.
36 *
37 *
38 * <p>For details on the behavior of the escapers in this class, see sections
39 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and
40 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
41 * XML specification.
42 *
43 * @author Alex Matevossian
44 * @author David Beaumont
45 * @since 15.0
46 */
47 @Beta
48 @GwtCompatible
49 public class XmlEscapers {
50 private XmlEscapers() {}
51
52 private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
53 private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
54
55 // For each xxxEscaper() method, please add links to external reference pages
56 // that are considered authoritative for the behavior of that escaper.
57
58 /**
59 * Returns an {@link Escaper} instance that escapes special characters in a
60 * string so it can safely be included in an XML document as element content.
61 * See section
62 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
63 * XML specification.
64 *
65 * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not
66 * safe</b> to use this escaper to escape attribute values. Use
67 * {@link #xmlContentEscaper} if the output can appear in element content or
68 * {@link #xmlAttributeEscaper} in attribute values.
69 *
70 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
71 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
72 * are not permitted in XML. For more detail see section <a
73 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
74 * XML specification.
75 *
76 * <p>This escaper does not escape non-ASCII characters to their numeric
77 * character references (NCR). Any non-ASCII characters appearing in the input
78 * will be preserved in the output. Specifically "\r" (carriage return) is
79 * preserved in the output, which may result in it being silently converted to
80 * "\n" when the XML is parsed.
81 *
82 * <p>This escaper does not treat surrogate pairs specially and does not
83 * perform Unicode validation on its input.
84 */
85 public static Escaper xmlContentEscaper() {
86 return XML_CONTENT_ESCAPER;
87 }
88
89 /**
90 * Returns an {@link Escaper} instance that escapes special characters in a
91 * string so it can safely be included in XML document as an attribute value.
92 * See section
93 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a>
94 * of the XML specification.
95 *
96 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
97 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
98 * are not permitted in XML. For more detail see section <a
99 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
100 * XML specification.
101 *
102 * <p>This escaper does not escape non-ASCII characters to their numeric
103 * character references (NCR). However, horizontal tab {@code '\t'}, line feed
104 * {@code '\n'} and carriage return {@code '\r'} are escaped to a
105 * corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"}
106 * respectively. Any other non-ASCII characters appearing in the input will
107 * be preserved in the output.
108 *
109 * <p>This escaper does not treat surrogate pairs specially and does not
110 * perform Unicode validation on its input.
111 */
112 public static Escaper xmlAttributeEscaper() {
113 return XML_ATTRIBUTE_ESCAPER;
114 }
115
116 private static final Escaper XML_ESCAPER;
117 private static final Escaper XML_CONTENT_ESCAPER;
118 private static final Escaper XML_ATTRIBUTE_ESCAPER;
119 static {
120 Escapers.Builder builder = Escapers.builder();
121 // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
122 // (Unicode code points above \uFFFF are represented via surrogate pairs
123 // which means they are treated as pairs of safe characters).
124 builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
125 // Unsafe characters are replaced with the Unicode replacement character.
126 builder.setUnsafeReplacement("\uFFFD");
127
128 /*
129 * Except for \n, \t, and \r, all ASCII control characters are replaced with
130 * the Unicode replacement character.
131 *
132 * Implementation note: An alternative to the following would be to make a
133 * map that simply replaces the allowed ASCII whitespace characters with
134 * themselves and to set the minimum safe character to 0x20. However this
135 * would slow down the escaping of simple strings that contain \t, \n, or
136 * \r.
137 */
138 for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
139 if (c != '\t' && c != '\n' && c != '\r') {
140 builder.addEscape(c, "\uFFFD");
141 }
142 }
143
144 // Build the content escaper first and then add quote escaping for the
145 // general escaper.
146 builder.addEscape('&', "&");
147 builder.addEscape('<', "<");
148 builder.addEscape('>', ">");
149 XML_CONTENT_ESCAPER = builder.build();
150 builder.addEscape('\'', "'");
151 builder.addEscape('"', """);
152 XML_ESCAPER = builder.build();
153 builder.addEscape('\t', "	");
154 builder.addEscape('\n', "
");
155 builder.addEscape('\r', "
");
156 XML_ATTRIBUTE_ESCAPER = builder.build();
157 }
158 }